In [1]:
import wandb
import pandas as pd
import ydata_profiling
/Users/thierrygrimm/TorchStudio/python/envs/mlflow-bfab3c62e4539be1cc3154fca55686ec122e435e/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
/Users/thierrygrimm/TorchStudio/python/envs/mlflow-bfab3c62e4539be1cc3154fca55686ec122e435e/lib/python3.11/site-packages/numba/core/decorators.py:262: NumbaDeprecationWarning: numba.generated_jit is deprecated. Please see the documentation at: https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-generated-jit for more information and advice on a suitable replacement.
  warnings.warn(msg, NumbaDeprecationWarning)
/Users/thierrygrimm/TorchStudio/python/envs/mlflow-bfab3c62e4539be1cc3154fca55686ec122e435e/lib/python3.11/site-packages/visions/backends/shared/nan_handling.py:50: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  @nb.jit
In [2]:
# Start a Weights & Biases run for this notebook; save_code=True asks W&B to
# store the notebook code alongside the run.
run = wandb.init(
    project="nyc_airbnb",
    group="eda",
    save_code=True,
)
wandb: Currently logged in as: thierrygrimm. Use `wandb login --relogin` to force relogin
Tracking run with wandb version 0.15.8
Run data is saved locally in /Users/thierrygrimm/Library/CloudStorage/OneDrive-Persönlich/Machine Learning/Udacity projects/ml-pipeline-rental-prices/src/eda/wandb/run-20230802_120248-trr8n7pp
Syncing run ruby-lake-4 to Weights & Biases (docs)
View project at https://wandb.ai/thierrygrimm/nyc_airbnb
View run at https://wandb.ai/thierrygrimm/nyc_airbnb/runs/trr8n7pp
In [3]:
# Resolve the latest version of the "sample.csv" artifact through W&B, obtain
# the path of its file, and load that CSV into a DataFrame.
sample_artifact = wandb.use_artifact("sample.csv:latest")
local_path = sample_artifact.file()
df = pd.read_csv(local_path)
In [4]:
# Preview the first rows of the freshly loaded data as a quick sanity check.
df.head()
Out[4]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 9138664 Private Lg Room 15 min to Manhattan 47594947 Iris Queens Sunnyside 40.74271 -73.92493 Private room 74 2 6 2019-05-26 0.13 1 5
1 31444015 TIME SQUARE CHARMING ONE BED IN HELL'S KITCHEN... 8523790 Johlex Manhattan Hell's Kitchen 40.76682 -73.98878 Entire home/apt 170 3 0 NaN NaN 1 188
2 8741020 Voted #1 Location Quintessential 1BR W Village... 45854238 John Manhattan West Village 40.73631 -74.00611 Entire home/apt 245 3 51 2018-09-19 1.12 1 0
3 34602077 Spacious 1 bedroom apartment 15min from Manhattan 261055465 Regan Queens Astoria 40.76424 -73.92351 Entire home/apt 125 3 1 2019-05-24 0.65 1 13
4 23203149 Big beautiful bedroom in huge Bushwick apartment 143460 Megan Brooklyn Bushwick 40.69839 -73.92044 Private room 65 2 8 2019-06-23 0.52 2 8
In [5]:
# Build a ydata-profiling report object for the raw DataFrame.
profile = ydata_profiling.ProfileReport(df)
In [8]:
# Display the profiling report inline in the notebook.
profile.to_notebook_iframe()
Summarize dataset: 100%|█████| 126/126 [00:06<00:00, 19.44it/s, Completed]
Generate report structure: 100%|████████████| 1/1 [00:03<00:00,  3.21s/it]
Render HTML: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.18s/it]
In [9]:
# Treat rows whose price falls outside [min_price, max_price] as outliers and
# discard them; both bounds are inclusive.
min_price, max_price = 10, 350
idx = (df['price'] >= min_price) & (df['price'] <= max_price)
df = df.loc[idx].copy()  # copy so the column assignment below targets an independent frame

# Parse last_review into datetime values (missing entries stay missing, as NaT).
df['last_review'] = pd.to_datetime(df['last_review'])
In [10]:
# Inspect row count, per-column non-null counts and dtypes after the cleaning step.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 19001 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              19001 non-null  int64         
 1   name                            18994 non-null  object        
 2   host_id                         19001 non-null  int64         
 3   host_name                       18993 non-null  object        
 4   neighbourhood_group             19001 non-null  object        
 5   neighbourhood                   19001 non-null  object        
 6   latitude                        19001 non-null  float64       
 7   longitude                       19001 non-null  float64       
 8   room_type                       19001 non-null  object        
 9   price                           19001 non-null  int64         
 10  minimum_nights                  19001 non-null  int64         
 11  number_of_reviews               19001 non-null  int64         
 12  last_review                     15243 non-null  datetime64[ns]
 13  reviews_per_month               15243 non-null  float64       
 14  calculated_host_listings_count  19001 non-null  int64         
 15  availability_365                19001 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(7), object(5)
memory usage: 2.5+ MB
In [ ]:
# Close the W&B run created by wandb.init.
run.finish()